{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "from sklearn.preprocessing import normalize\n",
    "from sklearn.cluster import MiniBatchKMeans\n",
    "\n",
    "from sklearn import metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>4983</th>\n",
       "      <th>4984</th>\n",
       "      <th>4985</th>\n",
       "      <th>4986</th>\n",
       "      <th>4987</th>\n",
       "      <th>4988</th>\n",
       "      <th>4989</th>\n",
       "      <th>4990</th>\n",
       "      <th>4991</th>\n",
       "      <th>4992</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 4993 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     0    1    2    3    4    5    6    7    8    9  ...   4983  4984  4985  \\\n",
       "0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0   0.0   0.0   \n",
       "1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0   0.0   0.0   \n",
       "2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0   0.0   0.0   \n",
       "3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0   0.0   0.0   \n",
       "4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0   0.0   0.0   \n",
       "\n",
       "   4986  4987  4988  4989  4990  4991  4992  \n",
       "0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  \n",
       "1   0.0   0.0   0.0   0.0   0.0   0.0   0.0  \n",
       "2   0.0   0.0   0.0   0.0   0.0   0.0   0.0  \n",
       "3   0.0   0.0   0.0   0.0   0.0   0.0   0.0  \n",
       "4   0.0   0.0   0.0   0.0   0.0   0.0   0.0  \n",
       "\n",
       "[5 rows x 4993 columns]"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the feature table, then split off the 'mark' column (kept as Y_data)\n",
    "# from the remaining columns (X_data).\n",
    "train = pd.read_csv('FE_train_tfidf.csv')\n",
    "Y_data = train['mark']\n",
    "X_data = train.drop('mark', axis='columns')\n",
    "\n",
    "# Preview the first five feature rows.\n",
    "X_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "       ..., \n",
       "       [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# L2-normalise every row (sample) of the feature matrix.\n",
    "# The result is bound back to X_data explicitly: `copy=False` is only a\n",
    "# best-effort hint, so relying on the DataFrame being modified in place could\n",
    "# silently leave the features un-normalised.\n",
    "X_data = pd.DataFrame(normalize(X_data, norm=\"l2\"),\n",
    "                      index=X_data.index, columns=X_data.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>4983</th>\n",
       "      <th>4984</th>\n",
       "      <th>4985</th>\n",
       "      <th>4986</th>\n",
       "      <th>4987</th>\n",
       "      <th>4988</th>\n",
       "      <th>4989</th>\n",
       "      <th>4990</th>\n",
       "      <th>4991</th>\n",
       "      <th>4992</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4768</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4769</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4770</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4771</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4772</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 4993 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        0    1    2    3    4    5    6    7    8    9  ...   4983  4984  \\\n",
       "4768  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0   0.0   \n",
       "4769  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0   0.0   \n",
       "4770  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0   0.0   \n",
       "4771  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0   0.0   \n",
       "4772  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0   0.0   \n",
       "\n",
       "      4985  4986  4987  4988  4989  4990  4991  4992  \n",
       "4768   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  \n",
       "4769   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  \n",
       "4770   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  \n",
       "4771   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  \n",
       "4772   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  \n",
       "\n",
       "[5 rows x 4993 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the last five rows of the feature matrix.\n",
    "X_data.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def K_cluster_analysis(K, X, random_state=None):\n",
    "    \"\"\"Cluster X into K groups with MiniBatchKMeans and score the partition.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    K : int\n",
    "        Number of clusters.\n",
    "    X : array-like\n",
    "        Feature matrix; passed unchanged to MiniBatchKMeans.fit_predict and\n",
    "        to the scoring function.\n",
    "    random_state : int, RandomState instance or None, default None\n",
    "        Forwarded to MiniBatchKMeans. None keeps the previous unseeded\n",
    "        behaviour; pass an int to get reproducible scores.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    float\n",
    "        Calinski-Harabasz index of the predicted labels (higher is better).\n",
    "    \"\"\"\n",
    "    print(\"K-means begin with clusters: {}\".format(K))\n",
    "\n",
    "    # Mini-batch variant of K-means.\n",
    "    mb_kmeans = MiniBatchKMeans(n_clusters=K, random_state=random_state)\n",
    "    y_pred = mb_kmeans.fit_predict(X)\n",
    "\n",
    "    # Criteria for evaluating K.\n",
    "    # The training data in this case study is labelled, so a reference-based\n",
    "    # (external) metric could be used as well:\n",
    "    # v_score = metrics.v_measure_score(y_val, y_val_pred)\n",
    "\n",
    "    # Metrics that need no reference labels can also be used: the silhouette\n",
    "    # coefficient and the Calinski-Harabasz index. For both, a larger value\n",
    "    # means a better clustering.\n",
    "    # scikit-learn first exposed the index under the misspelled name\n",
    "    # `calinski_harabaz_score`, which was later deprecated and removed, so\n",
    "    # prefer the correct spelling and fall back only on old releases.\n",
    "    ch_score_fn = getattr(metrics, \"calinski_harabasz_score\", None)\n",
    "    if ch_score_fn is None:\n",
    "        ch_score_fn = metrics.calinski_harabaz_score\n",
    "    CH_score = ch_score_fn(X, y_pred)\n",
    "\n",
    "    # The silhouette coefficient is too slow to compute on large samples.\n",
    "    # si_score = metrics.silhouette_score(X, y_pred)\n",
    "\n",
    "    print(\"CH_score: {}\".format(CH_score))\n",
    "    # print(\"si_score: {}\".format(si_score))\n",
    "\n",
    "    return CH_score  # , si_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "K-means begin with clusters: 5\n",
      "CH_score: 16.240524703487225\n",
      "K-means begin with clusters: 10\n",
      "CH_score: 15.333786508367828\n",
      "K-means begin with clusters: 15\n",
      "CH_score: 12.104964099782345\n",
      "K-means begin with clusters: 20\n",
      "CH_score: 9.580130213140198\n",
      "K-means begin with clusters: 25\n",
      "CH_score: 8.029093945017086\n",
      "K-means begin with clusters: 30\n",
      "CH_score: 8.415875401389794\n",
      "K-means begin with clusters: 35\n",
      "CH_score: 6.963176654964053\n",
      "K-means begin with clusters: 40\n",
      "CH_score: 6.821076168781553\n",
      "K-means begin with clusters: 45\n",
      "CH_score: 5.427743672664738\n",
      "K-means begin with clusters: 50\n",
      "CH_score: 4.794323987088283\n",
      "K-means begin with clusters: 55\n",
      "CH_score: 4.318643329907371\n",
      "K-means begin with clusters: 60\n",
      "CH_score: 4.480751847520593\n"
     ]
    }
   ],
   "source": [
    "# Search range for the hyper-parameter K (number of clusters): 5, 10, ..., 60.\n",
    "Ks = list(range(5, 61, 5))\n",
    "\n",
    "# One Calinski-Harabasz score per candidate K, in the same order as Ks.\n",
    "CH_scores = [K_cluster_analysis(K, X_data) for K in Ks]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAD8CAYAAABn919SAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAHHRJREFUeJzt3Xuc1nPex/HXZ5qmlA7SSLQZx7rTQe2lLbVZcijacute\nm0N3klqWXVolclaJsNhlJSlyyioRN91RbCRlIkqRbEopM+4OrENJ3/uP78xWY6aZuU6/6/e73s/H\n43pcc/2un/l9vg7vvr6/7+/7NeccIiISfjlBFyAiIsmhQBcRiQgFuohIRCjQRUQiQoEuIhIRCnQR\nkYhQoIuIRIQCXUQkIhToIiIRkZvOizVu3NgVFBSk85IiIqG3ePHiL51z+ZWdl9ZALygooLCwMJ2X\nFBEJPTNbU5XzNOQiIhIRCnQRkYhQoIuIRIQCXUQkIhToIiIRoUAXEYkIBbqISESEItDnzIF77oHt\n24OuREQkc4Ui0GfMgMsvh5YtYepU2Lkz6IpERDJPpYFuZpPMrMjMlpU5/gcz+9DMPjCzcakrEf76\nV5g1C+rXh7PPho4dYe7cVF5RRCR8qtJDfxjosfsBMzsB6AO0c84dDdyR/NJ2vx6ceiq88w5MmQLF\nxdC9O/TsCe+/n8ori4iER6WB7pybB2wqc/hi4Fbn3LaSc4pSUNtP5ORA//7w0Udwxx2wcCEccwwM\nGABr16ajAhGRzBXvGPpRwC/NbKGZ/cPMjq3oRDMbYmaFZlZYXFwc5+X2VLs2XHEFfPIJDB8OTz0F\nRx3lf968OSmXEBEJnXgDPRdoBHQChgN/NzMr70Tn3ATnXMw5F8vPr3T1x2rZbz+47TZYuRL69YM7\n74TDDoPbb4fvv0/qpUREMl68gb4OeMZ5i4CdQOPklVU9zZvDww/DkiXQuTNceaXvsU+ZAj/+GFRV\nIiLpFW+gPwucAGBmRwF5wJfJKipebdvCiy/6GTBNmvix9Q4d4KWXwLmgqxMRSa2qTFt8ElgAtDCz\ndWY2CJgEHFYylXEqMMC5zInME07wN0ynToV//QtOO83PitHeGiISZZbOHI7FYi7dOxZt3w7jx8Oo\nUfDll36sfcwYP9YuIhIGZrbYORer7LxQPCmaiLw8+OMf/YyYa66B557zT5xedpmfzy4iEhWRD/RS\n9evD6NGwahWcfz7cey8cfrjvrX/7bdDViYgkLmsCvdRBB8GECbB0qR9rv/ZaOOIImDgRduwIujoR\nkfhlXaCXatXKD7+8/joccggMHuxnycycqRkxIhJOWRvopbp2hTffhOnT/Zz1Pn2gWzf48MOgKxMR\nqZ6sD3Twi3+deSYsWwb33w/Ll/s1Y9RTF5EwUaDvpmZNuOgiv/BXYaEfkhERCQsFejn69/dLB1x3\nnTbTEJHwUKCXIzcXbrrJD8E89VTQ1YiIVI0CvQJnnQWtW8MNN2g6o4iEgwK9Ajk5frmAjz+GRx8N\nuhoRkcop0PeiTx+Ixfzwy7ZtQVcjIrJ3CvS9MPPLBaxZAw89FHQ1IiJ7p0CvxCmn+IePRo+G774L\nuhoRkYop0Cth5hfw2rDBP3QkIpKpFOhV0K0bnHwyjB0LX38ddDUiIuVToFfR6NF+g4y//CXoSkRE\nyqdAr6KOHaF3b7j9dti8OehqRER+SoFeDTffDFu3wp13Bl2JiMhPKdCroV07/wTp3Xdr+zoRyTwK\n9Gq66SY/ffG224KuRERkTwr0amrZ0q/GeN998PnnQVcjIrKLAj0O11/vF+waMyboSkREdlGgx+Gw\nw2DQIHjwQfj006CrERHxKg10M5tkZkVmtqyc764wM2dmjVNTXua69lq/IuPNNwddiYiIV5Ue+sNA\nj7IHzexnwCnA2iTXFArNmsHFF8Mjj8DKlUFXIyJShUB3zs0DNpXz1V3AlUDWbqV81VVQuzbceGPQ\nlYiIxDmGbmZ9gPXOufeqcO4QMys0s8
LiiE3ebtIELrsMpk6FpUuDrkZEsl21A93M6gAjgeurcr5z\nboJzLuaci+Xn51f3chlv2DCoV8/PfBERCVI8PfTDgUOB98zsU6AZ8I6ZHZjMwsKiUSO44gp49lko\nLAy6GhHJZtUOdOfcUufcAc65AudcAbAO6OCc25j06kLi8sth//39zBcRkaBUZdrik8ACoIWZrTOz\nQakvK1zq14cRI+B//xdefz3oakQkW5lz6ZukEovFXGFExyW+/dY/cNSiBbz2mt/pSEQkGcxssXMu\nVtl5elI0SerUgWuugXnz4JVXgq5GRLKRAj2JhgyBn/3Mj6Wn8X98REQABXpS1arlpy8uWgQvvBB0\nNSKSbRToSTZgABx+OFx3HezcGXQ1IpJNFOhJVrOm3wTjvfdg2rSgqxGRbKJAT4F+/aBVq13rpouI\npIMCPQVq1PDL6n70ETzxRNDViEi2UKCnyJlnQvv2fiXG7duDrkZEsoECPUXMYPRoWL0aJk8OuhoR\nyQYK9BTq2RM6d4ZRo+D774OuRkSiToGeQqW99PXrYfz4oKsRkahToKfYiSf619ix8K9/BV2NiESZ\nAj0NRo2CoiK4996gKxGRKFOgp8Fxx8Fpp8G4cbBlS9DViEhUKdDTZNQo2LwZ7ror6EpEJKoU6GnS\noQP07esD/csvg65GRKJIgZ5GN93kb4yOGxd0JSISRQr0NDr6aDj3XH9zdMOGoKsRkahRoKfZDTf4\npQDGjg26EhGJGgV6mh1xBAwcCA88AGvXBl2NiESJAj0A113n30eNCrYOEYkWBXoAmjeH3/3OL9q1\nalXQ1YhIVCjQA3L11ZCX52e+iIgkgwI9IE2bwqWXwuOPwwcfBF2NiERBpYFuZpPMrMjMlu127HYz\n+9DM3jezGWbWMLVlRtOIEbDvvn7mi4hIoqrSQ38Y6FHm2MtAa+dcW2AlcHWS68oK++8PQ4fC9Onw\n1ltBVyMiYVdpoDvn5gGbyhyb7Zwr3f74LaBZCmrLCldcAc2awaBB2gRDRBKTjDH0C4CXkvB7slL9\n+vDgg7B8ud9YWkQkXgkFupldA+wAHt/LOUPMrNDMCouLixO5XGT16OEfNho3DhYvDroaEQmruAPd\nzM4HegHnOudcRec55yY452LOuVh+fn68l4u8P/8ZmjTxwb59e9DViEgYxRXoZtYDuBLo7Zz7Nrkl\nZaeGDf1yAEuXwpgxQVcjImFUlWmLTwILgBZmts7MBgH3AvWAl81siZlpC+Qk6NULzjsPbrkFliwJ\nuhoRCRvby2hJ0sViMVdYWJi264XRpk3QqpV/8GjRIqhZM+iKRCRoZrbYORer7Dw9KZphGjWC8eN9\nD/3WW4OuRkTCRIGegc44A/r186sxLl0adDUiEhYK9Az117/6G6UDB8KOHZWfLyKiQM9QjRvDfff5\neel33BF0NSISBgr0DPab30Dfvn7xruXLg65GRDKdAj3D3Xcf1KsHF1wAP/4YdDUikskU6BmuSRM/\nnr5wIdx1V9DViEgmU6CHQL9+0KeP34t05cqgqxGRTKVADwEzuP9+2GcfDb2ISMUU6CHRtCncfTfM\nn++HYEREylKgh0j//nD66TByJKxaFXQ1IpJpFOghYuZXZMzL8zsc7dwZdEUikkkU6CFz8MF+7fR5\n8/y4uohIKQV6CA0cCKeeCiNGwOrVQVcjIplCgR5CZjBhAuTkwIUXQhpXQBaRDKZAD6nmzf0aL3Pn\n+nAXEVGgh9jgwdC9OwwfDmvXBl2NiARNgR5iZjBxop/tMniwhl5Esp0CPeQKCuC222D2bJg8Oehq\nRCRICvQIuPhiOP54GDoU1q0LuhoRCYoCPQJycuChh+CHH+B3v9PQi0i2UqBHxOGHw9ix8OKL8Oij\nQVcjIkFQoEfIH/4AXbrAZZfBhg1BVyMi6aZAj5CcHJg0Cb7/Hi66SEMvItmm0kA3s0lmVmRmy3Y7\n1s
jMXjazj0ve90ttmVJVRx0Fo0bBzJnw5JNBVyMi6VSVHvrDQI8yx64C5jjnjgTmlHyWDDF0KHTq\n5Idgvvgi6GpEJF0qDXTn3DxgU5nDfYBHSn5+BDgjyXVJAmrU8EMv33wDl1wSdDUiki7xjqE3cc6V\n3nbbCDRJUj2SJP/xH3DjjTB9Ojz9dNDViEg6JHxT1DnngApvv5nZEDMrNLPC4uLiRC8n1TBsGMRi\n8Pvfg/7Wi0RfvIH+hZk1BSh5L6roROfcBOdczDkXy8/Pj/NyEo/cXL8cwNatfjxdRKIt3kCfCQwo\n+XkA8FxyypFka90arr8ennoKZswIuhoRSaWqTFt8ElgAtDCzdWY2CLgVONnMPgZOKvksGWrECGjf\n3q/58n//F3Q1IpIquZWd4Jw7u4Kvuie5FkmRmjX90EssBpdfrqUBRKJKT4pmiXbtYORIeOwxeP75\noKsRkVRQoGeRa66BNm3gggvgvfeCrkZEkk2BnkXy8vyc9Nq1/frp8+cHXZGIJJMCPcu0aAFvvAEH\nHAAnnwwvvRR0RSKSLAr0LHTIIT7UW7SA3r1h6tSgKxKRZFCgZ6kDDoDXXoPOneGcc2D8+KArEpFE\nKdCzWIMGMGsW9Ozp56iPHas11EXCTIGe5erUgWef9b30kSPhyisV6iJhVemDRRJ9NWv6h40aNoQ7\n7oDNm+GBB/wyvCISHgp0Afz2dffeC40awejRsGULPP441KoVdGUiUlUKdPk3M799XaNG8Kc/+VUa\nZ8yAffcNujIRqQqNoctPDB3q136ZOxdOOgk2ld2vSkQykgJdynX++X63o3ff9U+Vfv550BWJSGUU\n6FKhM86AF1+E1auha1f45JOgKxKRvVGgy1517+6HXrZu9aG+dGnQFYlIRRToUqmOHeH11/1MmG7d\nYMGCoCsSkfIo0KVKWrXyqzM2buxvlM6eHXRFIlKWAl2qrKDA99SPOAJ69fJL8YpI5lCgS7UceCD8\n4x9+GKZfP5g4MeiKRKSUAl2qrWFDP+RyyikweDCMGxd0RSICCnSJU5068Nxz8NvfwogRcNVVwS3q\ntXGjnzM/dKifallYGEwdIkHTo/8St7w8v95Lw4Zw221+Ua+//S21i3rt3AkrVvgbtPPn+406/vlP\n/13t2v4PmuOP95t2/PrXqatDJBMp0CUhNWrA/ff79V/GjvWh/thjPuyT4fvv4e23d4X3m2/6a4Df\npKNLF/j97/0c+fbt/TIFv/6176n/5S9wySXJqUMkDBTokjAzuOUWH+rDh8NXX/khkLp1q/+7iot3\n9b7nz/fDJz/84L9r2RL69vUh3qWLn21jtudff+CBfiems8+GSy/1T7mOG+fn0ItEXUKBbmZDgQsB\nBywFBjrnvk9GYRI+w4bBfvvBkCF+A+r/+R//uSLOwcqVu3rf8+f7z+B7+Mce68fFu3b1W+U1bly1\nOurW9atEXn453HknrFkDU6bAPvsk3kaRTBZ3oJvZwcAfgVbOue/M7O9AP+DhJNUmITRokB9TP+cc\nP5Y9e7bvNQNs2waLF+/ZA//yS//d/vv7XvegQf795z/3Y+LxqlHDD7kceqj/g2b9en8TNz8/8TaK\nZKpEh1xygX3M7AegDqA1+YS+feGFF+A//9P3rv/rv3x4v/22D3WAI4/0Dyd16eLPadHip8MniTLz\n67ofcgicdx4cd5xfbOzII5N7HZFMYS6BuWZmdhkwBvgOmO2cO3dv58diMVeoOWVZ46234PTT4euv\noUMHH9xduvhgbdIkvbUsWAC9e/thnpkzfQ0iYWFmi51zsUrPizfQzWw/YDrwW2AL8DQwzTn3WJnz\nhgBDAJo3b/7zNWvWxHU9Cadt2+DHH/10wqCtWgWnnQZr1/o9VH/zm6ArEqmaqgZ6Ivf+TwJWO+eK\nnXM/AM8AP+n3OOcmOOdizrlYvgYws06tWpkR5uBnxbz5JsRicNZZ
fkPsoB6GEkmFRAJ9LdDJzOqY\nmQHdgRXJKUskNRo3hlde8b3z4cP91MYdO4KuSiQ54r4p6pxbaGbTgHeAHcC7wIRkFSaSKrVr+ydJ\nCwrg9tv9tMapU7UZtoRfQo9bOOducM61dM61ds71d85tS1ZhIqmUk+MfOPrb3+Cll/wUyw0bgq5K\nJDF6fk6y2sUX+1kvH30EnTrBBx8EXZFI/BTokvVOP92v8b59u59W+eqrQVckEh8Fugj+ydS33oKD\nD4ZTT/XTGkXCRoEuUuKQQ/wTrV27wn//N4wapWmNEi4KdJHdNGwIs2ZB//5w/fV+bZnS1R5FMp2W\nzxUpIy8PHnnEL+x1883w2WcwbRo0aBB0ZSJ7px66SDnM4KabYNIkv776L3/pg10kkynQRfZi4EA/\nT33NGj+tccmSoCsSqZgCXaQSJ53kN+DIyfE99Vmzgq5IpHwKdJEqaNMGFi70C3z16gUPPhh0RSI/\npUAXqaKDDoJ58/z2ekOGwMiRsHNn0FWJ7KJZLiLVUK8ePP88XHIJjB3r5623aQMHHLDnKz/fvzds\nmPydmEQqokAXqabcXBg/3m+b98AD8P77sGVL+efWrLkr3Evfywv+0lfduulti0RLQlvQVZe2oJOo\n2r7db3hdVLTnq7i4/M/ffFP+76lTp/ygL/18yinp375PglfVHYvUQxdJgrw8P8Z+0EFVO/+bb3aF\ne9nQLz32+ed+mmRR0a6nVZs0geeeg1/8InVtkfBSoIsEoG5d/yooqPxc52DrVlixAs4916/dPnky\nnH12ysuUkNEsF5EMZ+ZvrnbuDIsWQceOcM45cN11mmUje1Kgi4RI6Z6oF1wAo0f7za4rGo+X7KNA\nFwmZvDyYOBHuvBOeeQa6dYN164KuSjKBAl0khMzgT3/y2+etXOmHYRYtCroqCZoCXSTEevWCBQug\nVi1/s/Spp4KuSIKkQBcJudatfe88FoN+/eDGG3WzNFsp0EUiID/f3yw9/3y/jvvZZ8O33wZdlaSb\nAl0kImrV8htyjBsHTz/th2DWrw+6KkmnhALdzBqa2TQz+9DMVphZ52QVJiLVZwbDh8Ozz8KHH/qb\npYsXB12VpEuiPfR7gFnOuZZAO2BF4iWJSKJ69/YrQebm+k05nn466IokHeIOdDNrAHQDHgJwzm13\nzlWw5pyIpFvbtvD229C+vX8AadQov4yARFciPfRDgWJgspm9a2YTzUyLf4pkkAMOgDlzoH9/uP56\nv2TAd98FXZWkSiKBngt0AO53zrUHvgGuKnuSmQ0xs0IzKywuLk7gciISj9q14ZFH4NZb/Tz144+H\nDRuCrkpSIZFAXwesc84tLPk8DR/we3DOTXDOxZxzsfz8/AQuJyLxMoMRI/xSAcuXw7HHwrvvBl2V\nJFvcge6c2wh8ZmYtSg51B5YnpSoRSYkzzvA3S3NyoGtXH/ASHYnOcvkD8LiZvQ8cA9ySeEkikkrt\n2vknS9u2hb59YcwY3SyNioQ2uHDOLQEq3RZJRDLLgQfCq6/ChRfCtdf6zTMmTvTj7RJeelJUJEvV\nrg2PPup76I8/Dr/6FWzcGHRVkggFukgWM4ORI2H6dFi61D9ZumRJ0FVJvBToIsKZZ8Ibb/ix9C5d\n/NIBEj4KdBEB/BOlixZBmzY+4MeOhR9/DLoqqQ4Fuoj8W9Om/mZpv35+KKZ5c7j6ar8rkmQ+BbqI\n7GGfffxN0hkzoEMHuP12aNHCD8VMnAhffRV0hVIRBbqI/ISZfwjp+efhs8/8GuubN8Pgwb4XP2AA\nvPaadkbKNAp0Edmrpk39GusffOD3Lz3vPH/T9IQT4Igj4OabYc2aoKsUUKCLSBWZQadO8MADfnGv\nxx6Dww6DG26AQw+Fk0+GJ57Qao5BUqCLSLXVqQPnnuv3MV292of6qlX+WNOmcNFFsHChlhRINwW6\niCSkoMAH+iefwNy5frekKVN8
b751a7jjDj2Bmi4KdBFJipwcP64+ZYoP8AkToEEDP/7erJkP+hkz\nYPv2oCuNLgW6iCRd/fp+Rsybb/qFv4YNg8JC/8DSwQfD0KHw/vtBVxk9CnQRSamWLf1uSWvXwgsv\n+B2T7rvPL+Mbi/mfN20KuspoUKCLSFrk5sLpp8O0afD553DPPX5pgUsv9TdS+/aFSZP8dxIfc2m8\nDR2LxVxhYWHaricime/dd2HyZL/iY2mYt2sHPXv6V+fOULNmsDUGzcwWO+cq3XtCgS4iGcE5P67+\n0kv+NX++78E3aAAnneTDvUcPPwafbRToIhJqW7f6ee6zZvmAX7/eH2/bdlfv/bjjsqP3rkAXkchw\nDpYt29V7f+MN2LHDz6Yp7b337Bnd3rsCXUQi66uvYM6cXQG/bp0/3qbNrnDv0iWY3vu33/qlEUpf\nGzf690GD4PDD4/udCnQRyQrO+YXDdu+9//AD1Ku3Z++9WbPErrFly55BXV5ob9hQ/vLCubkwc6av\nIx4KdBHJSl9/vWfv/bPP/PHWrffsvefl+ZuuRUXlB3PZwN627afXqlPHT7ks+zrwwD0/77+/f5I2\nXgp0Ecl6zsHy5bvC/fXXfe993339q6io/DXd99uv/KAuG9b16vlVKFOtqoGem/pSRESCYQZHH+1f\nw4b53vvcuTB7tu9xlxfYTZpA7dpBVx4fBbqIZI169aBPH/+KooQf/TezGmb2rpm9kIyCREQkPslY\ny+UyYEUSfo+IiCQgoUA3s2bA6cDE5JQjIiLxSrSHfjdwJVDh3t9mNsTMCs2ssLi4OMHLiYhIReIO\ndDPrBRQ55xbv7Tzn3ATnXMw5F8vPz4/3ciIiUolEeuhdgN5m9ikwFTjRzB5LSlUiIlJtcQe6c+5q\n51wz51wB0A+Y65w7L2mViYhItWjHIhGRiEjro/9mVgysSdsFE9MY+DLoIlIkym2DaLdPbQuvRNp3\niHOu0puQaQ30MDGzwqqsnRBGUW4bRLt9alt4paN9GnIREYkIBbqISEQo0Cs2IegCUijKbYNot09t\nC6+Ut09j6CIiEaEeuohIRCjQATObZGZFZrZst2ONzOxlM/u45H2/IGuMl5n9zMxeNbPlZvaBmV1W\ncjz07TOz2ma2yMzeK2nbTSXHQ9+2UmWXp45Y2z41s6VmtsTMCkuORaJ9ZtbQzKaZ2YdmtsLMOqej\nbQp072GgR5ljVwFznHNHAnNKPofRDuAK51wroBNwiZm1Ihrt2wac6JxrBxwD9DCzTkSjbaXKLk8d\npbYBnOCcO2a36XxRad89wCznXEugHf6fYerb5pzTy99HKACW7fb5I6Bpyc9NgY+CrjFJ7XwOODlq\n7QPqAO8Av4hK24BmJf/hnwi8UHIsEm0rqf9ToHGZY6FvH9AAWE3JPcp0tk099Io1cc5tKPl5I9Ak\nyGKSwcwKgPbAQiLSvpIhiSVAEfCycy4ybaP85amj0jYAB7xiZovNbEjJsSi071CgGJhcMlw20czq\nkoa2KdCrwPk/UkM9HcjM9gWmA5c7577a/bswt88596Nz7hh8b7ajmbUu830o21aV5anD2rbddC35\nZ9cTPxTYbfcvQ9y+XKADcL9zrj3wDWWGV1LVNgV6xb4ws6YAJe9FAdcTNzOriQ/zx51zz5Qcjkz7\nAJxzW4BX8fdCotC2ipanjkLbAHDOrS95LwJmAB2JRvvWAetK/m8RYBo+4FPeNgV6xWYCA0p+HoAf\new4dMzPgIWCFc+7Pu30V+vaZWb6ZNSz5eR/8vYEPiUDbXMXLU4e+bQBmVtfM6pX+DJwCLCMC7XPO\nbQQ+M7MWJYe6A8tJQ9v0YBFgZk8Cv8KvhvYFcAPwLPB3oDl+hciznHObgqoxXmbWFXgdWMqusdiR\n+HH0ULfPzNoCjwA18J2Tvzvnbjaz/Ql523ZnZr8ChjnnekWlbWZ2GL5XDn6I4gnn3JgIte8Y/F
7L\necA/gYGU/DtKCtumQBcRiQgNuYiIRIQCXUQkIhToIiIRoUAXEYkIBbqISEQo0EVEIkKBLiISEQp0\nEZGI+H+oo8/loskQXQAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x239b07c8780>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "\n",
    "# Calinski-Harabasz score as a function of the number of clusters.\n",
    "plt.plot(Ks, np.array(CH_scores), 'b-', label='CH_scores')\n",
    "plt.xlabel('K (number of clusters)')\n",
    "plt.ylabel('Calinski-Harabasz score')\n",
    "# The label given to plot() is only rendered once a legend is drawn.\n",
    "plt.legend()\n",
    "\n",
    "# Best hyper-parameter: the K with the highest score. CH_scores is a flat\n",
    "# list aligned with Ks, so argmax directly gives the position in Ks.\n",
    "Best_K = Ks[int(np.argmax(CH_scores))]\n",
    "\n",
    "print(Best_K)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([2, 0, 1, ..., 2, 2, 0])"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Refit with the selected number of clusters. A fixed seed makes the cluster\n",
    "# labels reproducible across runs.\n",
    "RANDOM_STATE = 0\n",
    "mb_kmeans = MiniBatchKMeans(n_clusters=Best_K, random_state=RANDOM_STATE)\n",
    "\n",
    "# One cluster id per row of X_data.\n",
    "y_prediction = mb_kmeans.fit_predict(X_data)\n",
    "\n",
    "y_prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Column name for the cluster feature, e.g. 'Kmeans_5' when Best_K is 5.\n",
    "feat_names_Kmeans = \"Kmeans_\" + str(Best_K)\n",
    "\n",
    "# Put the cluster ids next to the 'mark' column and write both to CSV.\n",
    "cluster_ids = pd.Series(y_prediction, name=feat_names_Kmeans)\n",
    "y = pd.Series(Y_data, name='mark')\n",
    "train_kmeans = pd.concat([cluster_ids, y], axis=1)\n",
    "train_kmeans.to_csv('Text_Cluster_FE_train_KMeans.csv', header=True, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
